STUtility || 空间转录组多样本分析框架(二)
作者 | 周运来
男,
一个长大了才会遇到的帅哥,
稳健,潇洒,大方,靠谱。
一段生信缘,一棵技能树。
生信技能树核心成员,单细胞天地特约撰稿人,简书创作者,单细胞数据科学家。
我们在上一篇文章 STUtility || 空间转录组多样本分析框架(一)中演示了用STUtility分析空转多样本,主要是对空间信息和图像信息的分析,可以说凸显了空转应有的特性。在这里,我们将探讨:
空转数据和单转数据的相似性 用Seurat对空转数据的标准分析 感兴趣区域的边缘检测
同样地,我们载入数据加载R包,并执行Seurat的标准化:
library(Matrix)
library(magrittr)
library(dplyr)
library(ggplot2)
library(spdep)
se <- readRDS('se.rds')
# 我们可以去除一下切片的批次,但是内存太小,只能放弃了。
# Add a section column to your meta.data
#se$section <- paste0("section_", GetStaffli(se)[[, "sample", drop = T]])
# Run normalization with "vars.to.regress" option
?SCTransform
# se.batch.cor <- SCTransform(se, vars.to.regress = "section", variable.features.n=1000,conserve.memory=T)
se<- SCTransform(se)
# 空间高变基因
spatgenes <- CorSpatialGenes(se)
head(spatgenes)
gene cor
CRISP3 CRISP3 0.9636610
RPL17 RPL17 0.9379202
CXCL14 CXCL14 0.9371324
MGP MGP 0.9361545
CRIP1 CRIP1 0.9333115
NME2 NME2 0.9292425
heatmap.colors <- c("dark blue", "cyan", "yellow", "red", "dark red")
ST.FeaturePlot(se, features = c('CRISP3',"CXCL14",'CRIP1','MGP'), cols = heatmap.colors, dark.theme = T)
空转数据和单转数据的相似性
STUtility 的分析框架是继承Seurat的方法,那么我们不禁要问,目前的空间表达数据和单细胞转录组数据到底有多相似呢?拿pbmc3k数据比较一下。
# Get raw count data
umi_data <- GetAssayData(object = se, slot = "counts", assay = "RNA")
dim(umi_data)
umi_data[1:4,1:4]
# Calculate gene attributes
gene_attr <- data.frame(mean = rowMeans(umi_data),
detection_rate = rowMeans(umi_data > 0),
var = apply(umi_data, 1, var),
row.names = rownames(umi_data)) %>%
mutate(log_mean = log10(mean), log_var = log10(var))
head(gene_attr)
mean detection_rate var log_mean log_var
1 0.02564760 0.02513465 0.02601904 -1.590953 -1.5847087
2 0.03552193 0.03372660 0.03785564 -1.449503 -1.4218694
3 0.04334445 0.04142088 0.04531866 -1.363067 -1.3437230
4 0.02436522 0.02333932 0.02582668 -1.613230 -1.5879315
5 0.03500898 0.03270069 0.03840484 -1.455821 -1.4156140
6 0.07989228 0.06758143 0.10506953 -1.097495 -0.9785232
# Obtain spot attributes from Seurat meta.data slot
spot_attr <- se[[c("nFeature_RNA", "nCount_RNA")]]
# Get pbm raw count data
library(pbmc3k.SeuratData,lib.loc = 'E:\\software\\R\\R-4.0.2\\library')
AvailableData()
pbmc3kumi_data <- GetAssayData(object = pbmc3k.final, slot = "counts", assay = "RNA")
dim(pbmc3kumi_data)
# Calculate gene attributes
pbmc3kgene_attr <- data.frame(mean = rowMeans(pbmc3kumi_data),
detection_rate = rowMeans(pbmc3kumi_data > 0),
var = apply(pbmc3kumi_data, 1, var),
row.names = rownames(pbmc3kumi_data)) %>%
mutate(log_mean = log10(mean), log_var = log10(var))
绘制两种数据的分布:
p1 <- ggplot(gene_attr, aes(log_mean, log_var)) +
geom_point(alpha = 0.3, shape = 16, color = "white") +
geom_density_2d(size = 0.3) +
geom_abline(intercept = 0, slope = 1, color = 'red') +
ggtitle("Mean-variance relationship") + DarkTheme()
# add the expected detection rate under Poisson model
x = seq(from = -2, to = 2, length.out = 1000)
poisson_model <- data.frame(log_mean = x, detection_rate = 1 - dpois(0, lambda = 10^x))
p2 <- ggplot(gene_attr, aes(log_mean, detection_rate)) +
geom_point(alpha = 0.3, shape = 16, color = "white") +
geom_line(data = poisson_model, color='red') +
ggtitle("Mean-detection-rate relationship") + DarkTheme()
p3 <- ggplot(pbmc3kgene_attr, aes(log_mean, log_var)) +
geom_point(alpha = 0.3, shape = 16, color = "white") +
geom_density_2d(size = 0.3) +
geom_abline(intercept = 0, slope = 1, color = 'red') +
ggtitle("PBMC3k\n Mean-variance relationship") + DarkTheme()
# add the expected detection rate under Poisson model
x = seq(from = -2, to = 2, length.out = 1000)
poisson_model <- data.frame(log_mean = x, detection_rate = 1 - dpois(0, lambda = 10^x))
p4 <- ggplot(pbmc3kgene_attr, aes(log_mean, detection_rate)) +
geom_point(alpha = 0.3, shape = 16, color = "white") +
geom_line(data = poisson_model, color='red') +
ggtitle("PBMC3k\n Mean-detection-rate relationship") + DarkTheme()
cowplot::plot_grid(p1, p2,p3,p4, nrow = 2)
我们看到二种数据的分布还是很相似的,这也解释了为什么大部分的单细胞转录组的分析方法在空转中都是可以应用的。接下来,我们检查空转数据是否为稀疏矩阵,也就是零值有多少。
sum(pbmc3kumi_data == 0)/(ncol(pbmc3kumi_data)*nrow(pbmc3kumi_data))
[1] 0.9381182
sum(umi_data == 0)/(ncol(umi_data)*nrow(umi_data))
[1] 0.6779447
sum(umi_data == 0) / prod(dim(umi_data))
[1] 0.6779447
print(object.size(pbmc3kumi_data), units = "Mb")
26.7 Mb
print(object.size(umi_data), units = "Mb")
469.4 Mb
Seurat对空转数据的标准分析
可见在空转中零值也是很多的,虽然没有单转那么多。接下来进行表达数据的降维聚类分析。鉴于这一部分和单转太相似了,我们就不再每一步地解释了。
?RunNMF # 特征选择类似PCA
se <- RunNMF(se, nfactors = 40) # Specificy nfactors to choose the number of factors, default=20.
cscale <- c("darkblue", "cyan", "yellow", "red", "darkred")
ST.DimPlot(se,
dims = 1:2,
ncol = 2, # Sets the number of columns at dimensions level
grid.ncol = 2, # Sets the number of columns at sample level
reduction = "NMF",
dark.theme = T,
pt.size = 1,
center.zero = F,
cols = cscale)
print(se[["NMF"]])
分群并绘制分群信息:
FactorGeneLoadingPlot(se, factor = 1, dark.theme = TRUE)
se <- FindNeighbors(object = se, verbose = FALSE, reduction = "NMF", dims = 1:40)
se <- FindClusters(object = se, verbose = FALSE)
library(RColorBrewer)
n <- 19
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
ST.FeaturePlot(object = se, features = "seurat_clusters", dark.theme = T, cols = col_vector, pt.size = 2,sb.size = 5 )
head(se@assays$SCT@var.features, 20)
[1] "IGLC2" "IGHG3" "IGKC" "ALB" "IGLC3" "IGHG1" "IGHM" "IGLC1" "IGHA1" "MGP" "CRISP3" "IGHG2"
[13] "IGHG4" "CPB1" "IGHA2" "SERPINA3" "JCHAIN" "CXCL14" "CCL21" "CCL19"
top <- se@assays$SCT@var.features
FeatureOverlay(se,
features = c('IGLC2','CXCL14'),
sampleids = 1:2,
cols = c("darkblue", "cyan", "yellow", "red", "darkred"),
pt.size = 1.5,
pt.alpha = 0.5,
dark.theme = T,
ncols.samples = 2)
我们可以在UMAP空间上看spot的分群情况,当然,之前在Seurat中对分群做的一切动作这里都是可以做的,如split和group等。
se <- RunUMAP(se, reduction = "NMF", dims = 1:40, n.neighbors = 10)
DimPlot(object = se, reduction = "umap", pt.size = 1.5,ncol = 2)+ DarkTheme()
STUtility 还提供用三原色可视化分群信息的选项,这样可视化更加清楚。
se <- RunUMAP(object = se, dims = 1:40, verbose = FALSE, n.components = 3, reduction = "NMF", reduction.name = "umap.3d")
ST.DimPlot(object = se, dims = 1:3, reduction = "umap.3d", blend = T, dark.theme = T, pt.size = 1.5)
分群之后,我们要做的就是看每个群的特征,比较直接的生物学意义就是做差异分析:
markers <- FindMarkers(se, ident.1 = "12")
head(markers)
p_val avg_logFC pct.1 pct.2 p_val_adj
AC026355.1 7.800313e-211 0.6957137 0.799 0.165 1.269345e-206
U62317.5 6.955878e-192 0.4293024 0.454 0.053 1.131930e-187
CPB1 1.666918e-171 1.4851863 1.000 0.959 2.712576e-167
FCGR3B 4.019841e-160 1.1899326 0.883 0.335 6.541488e-156
IL6ST 5.819598e-158 0.6636229 1.000 0.998 9.470232e-154
SCUBE3 1.797465e-157 0.7575323 0.862 0.288 2.925015e-153
可视化marker基因:
FeatureOverlay(se,
features = c('IL6ST'),
sampleids = 1:2,
cols = c("darkblue", "cyan", "yellow", "red", "darkred"),
pt.size = 1.5,
pt.alpha = 0.5,
dark.theme = T,
ncols.samples = 2)
边缘检测
有时提取一一亚群点的“邻域”是有用的。,我们展示了如何应用这个方法来找到任何感兴趣区域的所有邻近点。
一旦定义好了一个感兴趣的区域(region of interest,ROI),并且希望找到与该区域相邻的所有点,您可以使用regionneighbors函数来自动检测这些点。例如,假设我们想要选择所有cluster 2 的边缘spot。第一步是确保seurat对象的标识是正确的,这里我们需要将其设置为“seurat_clusters”。当然, 如果是注释好的,那就更有生物学意义了。
?RegionNeighbours
st <- SetIdent(st, value = "seurat_clusters")
st <- RegionNeighbours(st, id = "2", verbose = TRUE)
FeatureOverlay(st, features = "nbs_2", ncols.samples = 2, sampleids = 1:2, cols = c("red", "lightgray"), pt.size = 2, dark.theme = T)
检测到亚群的边缘之后,肯定是要看看边缘和内部有什么差异了,最直观的就是做一下差异基因看看:
library(magrittr)
library(dplyr)
se <- SetIdent(se, value = "nbs_2")
nbs_2.markers <- FindMarkers(se, ident.1 = "2", ident.2 = "nbs_2")
nbs_2.markers$gene <- rownames(nbs_2.markers)
st.subset <- SubsetSTData(se, expression = nbs_2 %in% c("2", "nbs_2"))
sorted.marks <- nbs_2.markers %>% top_n(n = 40, wt = abs(avg_logFC))
sorted.marks <- sorted.marks[order(sorted.marks$avg_logFC, decreasing = T), ]
DoHeatmap(st.subset, features = sorted.marks$gene, group.colors = c("red", "lightgray"), disp.min = -2, disp.max = 2) + DarkTheme()
FeatureOverlay(st.subset, features =sorted.marks$gene[1:4], pt.size = 2, dark.theme = T,
ncols.features = 2, cols = c("darkblue", "cyan", "yellow", "red", "darkred"))
(https://en.wikipedia.org/wiki/Region_of_interest) 感兴趣的区域(通常缩写为ROI)是为特定目确定的数据集中的样本。ROI的概念在许多应用领域中普遍使用。例如,在医学成像中,为了测量肿瘤的大小,肿瘤的边界可以在图像或体积上确定。我们将会在下一篇文章中讲述如何选择感兴趣的区域。
如果你对单细胞转录组研究感兴趣,但又不知道如何入门,也许你可以关注一下下面的课程
看完记得顺手点个“在看”哦!
长按扫码可关注